{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "name": "#%% \n",
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "# Load the training subset from train_sub.csv (a bare filename, so it is resolved\n",
    "# against the current working directory).\n",
    "# NOTE(review): the saved head() output shows `id` in float notation (e.g. 1.000009e+18),\n",
    "# so ids may be losing precision on load -- confirm whether dtype={'id': str} is wanted.\n",
    "train_df = pd.read_csv('train_sub.csv')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "# Load the test subset from test_sub_count.csv (a bare filename, so it is resolved\n",
    "# against the current working directory).\n",
    "# NOTE(review): the saved head() output shows `id` in float notation (e.g. 1.000017e+19),\n",
    "# so ids may be losing precision on load -- confirm whether dtype={'id': str} is wanted.\n",
    "test_df = pd.read_csv('test_sub_count.csv')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "data": {
      "text/plain": "             id      hour    C1  banner_pos   site_id site_domain  \\\n0  1.000017e+19  14103100  1005           0  235ba823    f6ebf28e   \n1  1.000018e+19  14103100  1005           0  1fbe01fe    f3845767   \n2  1.000055e+19  14103100  1005           0  1fbe01fe    f3845767   \n3  1.000109e+19  14103100  1005           0  85f751fd    c4e18dd6   \n4  1.000138e+19  14103100  1005           0  85f751fd    c4e18dd6   \n\n  site_category    app_id app_domain app_category  ...  C19     C20  C21  \\\n0      f028772b  ecad2386   7801e8d9     07d7df22  ...  175  100075   23   \n1      28905ebd  ecad2386   7801e8d9     07d7df22  ...   35  100083   51   \n2      28905ebd  ecad2386   7801e8d9     07d7df22  ...   35  100083   51   \n3      50e219e0  51cedd4e   aefc06bd     0f2161f8  ...  809  100156   61   \n4      50e219e0  9c13b419   2347f47a     f95efa07  ...   47      -1  221   \n\n             user_id  user_id&media_id  user_id&C14_d  user_id&C17_d  \\\n0  69f45779_0eb711ec                 1              1              1   \n1  e8d44657_ecb851b2                 1              1              1   \n2  10fb085b_1f0bc64f                 1              1              1   \n3  422d257a_542422a7                 1              1              1   \n4  078c6b38_1f0bc64f                 1              1              1   \n\n   user_id&C14_h  user_id&C17_h  time  \n0              1              1    -1  \n1              1              1    -1  \n2              1              1    -1  \n3              1              1    -1  \n4              1              1    -1  \n\n[5 rows x 30 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>hour</th>\n      <th>C1</th>\n      <th>banner_pos</th>\n      <th>site_id</th>\n      <th>site_domain</th>\n      <th>site_category</th>\n      <th>app_id</th>\n      <th>app_domain</th>\n      <th>app_category</th>\n      <th>...</th>\n      <th>C19</th>\n      <th>C20</th>\n      <th>C21</th>\n      <th>user_id</th>\n      <th>user_id&amp;media_id</th>\n      <th>user_id&amp;C14_d</th>\n      <th>user_id&amp;C17_d</th>\n      <th>user_id&amp;C14_h</th>\n      <th>user_id&amp;C17_h</th>\n      <th>time</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1.000017e+19</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>235ba823</td>\n      <td>f6ebf28e</td>\n      <td>f028772b</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>07d7df22</td>\n      <td>...</td>\n      <td>175</td>\n      <td>100075</td>\n      <td>23</td>\n      <td>69f45779_0eb711ec</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1.000018e+19</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>07d7df22</td>\n      <td>...</td>\n      <td>35</td>\n      <td>100083</td>\n      <td>51</td>\n      <td>e8d44657_ecb851b2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n  
    <td>1.000055e+19</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>07d7df22</td>\n      <td>...</td>\n      <td>35</td>\n      <td>100083</td>\n      <td>51</td>\n      <td>10fb085b_1f0bc64f</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>1.000109e+19</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>85f751fd</td>\n      <td>c4e18dd6</td>\n      <td>50e219e0</td>\n      <td>51cedd4e</td>\n      <td>aefc06bd</td>\n      <td>0f2161f8</td>\n      <td>...</td>\n      <td>809</td>\n      <td>100156</td>\n      <td>61</td>\n      <td>422d257a_542422a7</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>1.000138e+19</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>85f751fd</td>\n      <td>c4e18dd6</td>\n      <td>50e219e0</td>\n      <td>9c13b419</td>\n      <td>2347f47a</td>\n      <td>f95efa07</td>\n      <td>...</td>\n      <td>47</td>\n      <td>-1</td>\n      <td>221</td>\n      <td>078c6b38_1f0bc64f</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 30 columns</p>\n</div>"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 10
    }
   ],
   "source": [
    "# Preview the first rows of the test frame to check which columns were loaded.\n",
    "test_df.head()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "data": {
      "text/plain": "             id  click      hour    C1  banner_pos   site_id site_domain  \\\n0  1.000009e+18      0  14102100  1005           0  1fbe01fe    f3845767   \n1  1.000017e+19      0  14102100  1005           0  1fbe01fe    f3845767   \n2  1.000037e+19      0  14102100  1005           0  1fbe01fe    f3845767   \n3  1.000064e+19      0  14102100  1005           0  1fbe01fe    f3845767   \n4  1.000068e+19      0  14102100  1005           1  fe8cc448    9166c161   \n\n  site_category    app_id app_domain  ... device_type device_conn_type    C14  \\\n0      28905ebd  ecad2386   7801e8d9  ...           1                2  15706   \n1      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   \n2      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   \n3      28905ebd  ecad2386   7801e8d9  ...           1                0  15706   \n4      0569f928  ecad2386   7801e8d9  ...           1                0  18993   \n\n   C15  C16   C17  C18  C19     C20  C21  \n0  320   50  1722    0   35      -1   79  \n1  320   50  1722    0   35  100084   79  \n2  320   50  1722    0   35  100084   79  \n3  320   50  1722    0   35  100084   79  \n4  320   50  2161    0   35      -1  157  \n\n[5 rows x 24 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>click</th>\n      <th>hour</th>\n      <th>C1</th>\n      <th>banner_pos</th>\n      <th>site_id</th>\n      <th>site_domain</th>\n      <th>site_category</th>\n      <th>app_id</th>\n      <th>app_domain</th>\n      <th>...</th>\n      <th>device_type</th>\n      <th>device_conn_type</th>\n      <th>C14</th>\n      <th>C15</th>\n      <th>C16</th>\n      <th>C17</th>\n      <th>C18</th>\n      <th>C19</th>\n      <th>C20</th>\n      <th>C21</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1.000009e+18</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>15706</td>\n      <td>320</td>\n      <td>50</td>\n      <td>1722</td>\n      <td>0</td>\n      <td>35</td>\n      <td>-1</td>\n      <td>79</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1.000017e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>...</td>\n      <td>1</td>\n      <td>0</td>\n      <td>15704</td>\n      <td>320</td>\n      <td>50</td>\n      <td>1722</td>\n      <td>0</td>\n      <td>35</td>\n      <td>100084</td>\n      <td>79</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>1.000037e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      
<td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>...</td>\n      <td>1</td>\n      <td>0</td>\n      <td>15704</td>\n      <td>320</td>\n      <td>50</td>\n      <td>1722</td>\n      <td>0</td>\n      <td>35</td>\n      <td>100084</td>\n      <td>79</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>1.000064e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>...</td>\n      <td>1</td>\n      <td>0</td>\n      <td>15706</td>\n      <td>320</td>\n      <td>50</td>\n      <td>1722</td>\n      <td>0</td>\n      <td>35</td>\n      <td>100084</td>\n      <td>79</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>1.000068e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>1</td>\n      <td>fe8cc448</td>\n      <td>9166c161</td>\n      <td>0569f928</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>...</td>\n      <td>1</td>\n      <td>0</td>\n      <td>18993</td>\n      <td>320</td>\n      <td>50</td>\n      <td>2161</td>\n      <td>0</td>\n      <td>35</td>\n      <td>-1</td>\n      <td>157</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 24 columns</p>\n</div>"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 4
    }
   ],
   "source": [
    "# Preview the first rows of the training frame to check which columns were loaded.\n",
    "train_df.head()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "outputs": [
    {
     "data": {
      "text/plain": "Index(['C1', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id',\n       'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model',\n       'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',\n       'C19', 'C20', 'C21'],\n      dtype='object')"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 43
    }
   ],
   "source": [
    "# Show every column after the first three; this is the same positional slice the\n",
    "# train-side frequency loop iterates over.\n",
    "train_df.columns[3:]"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "# Shared accumulator of 'column_value' tokens; the frequency loops below append to it.\n",
    "# Re-run this cell before re-running those loops, otherwise tokens are appended twice.\n",
    "feat= []"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "# For each training column after the first three (positional slice), add a\n",
    "# '<column>_<value>' token to `feat` for every value occurring at least 10 times.\n",
    "for col in train_df.columns[3:]:\n",
    "    counts = train_df[col].value_counts()\n",
    "    # Boolean mask keeps value_counts() ordering, so tokens are appended in the\n",
    "    # same order as a key-by-key scan would produce.\n",
    "    frequent = counts[counts >= 10].index\n",
    "    feat.extend(col + '_' + str(value) for value in frequent)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "# For each test column after the first two (positional slice), add a\n",
    "# '<column>_<value>' token to `feat` for every value occurring at least 10 times.\n",
    "for col in test_df.columns[2:]:\n",
    "    counts = test_df[col].value_counts()\n",
    "    # Boolean mask keeps value_counts() ordering, so tokens are appended in the\n",
    "    # same order as a key-by-key scan would produce.\n",
    "    frequent = counts[counts >= 10].index\n",
    "    feat.extend(col + '_' + str(value) for value in frequent)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "data": {
      "text/plain": "94718"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 8
    }
   ],
   "source": [
    "# Number of tokens accumulated so far.\n",
    "# NOTE(review): `feat` is a list filled from both the train and the test loop, so this\n",
    "# count may include duplicates that set(feat) removes before pickling -- confirm\n",
    "# whether len(set(feat)) is the figure actually wanted here.\n",
    "len(feat)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "import pickle"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "# Persist the de-duplicated tokens to fcount.pkl.\n",
    "# The `with` block closes (and flushes) the file deterministically; the previous\n",
    "# bare open() left the handle to be closed whenever it got garbage-collected.\n",
    "with open(\"fcount.pkl\", 'wb') as fout:\n",
    "    pickle.dump(set(feat), fout)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [],
   "source": [
    "# Persist the de-duplicated tokens to fcount_test.pkl.\n",
    "# The `with` block closes (and flushes) the file deterministically; the previous\n",
    "# bare open() left the handle to be closed whenever it got garbage-collected.\n",
    "# NOTE(review): this writes the same set(feat) as the fcount.pkl cell, so its content\n",
    "# depends on which loops ran beforehand -- confirm that is intended.\n",
    "with open(\"fcount_test.pkl\", 'wb') as fout:\n",
    "    pickle.dump(set(feat), fout)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}